# Alex Gazmararian
# agazmararian@gmail.com

library(tidyverse)
library(tidylog)
library(janitor)
library(readxl)
library(here)
library(tidycensus)
library(sf)
library(terra)
library(exactextractr)

# Warnings are intentionally left at the session's current level.
# A global options(warn = -1) would also hide data-quality warnings (for
# example "NAs introduced by coercion" when vote counts are parsed), and
# on.exit() is designed to run when a *function* exits, so a top-level call
# cannot be relied on to restore the setting at the end of the script.
# Spatial calls that are known to be noisy are wrapped in suppressWarnings()
# individually further down.

# Read the CQ county-level presidential returns workbook
cq_path <- here("data", "input", "elections", "CQ_Voting_Pres_Gen_by_county_1920_2020.xlsx")
g <- suppressWarnings(read_xlsx(cq_path, progress = FALSE))

# Keep the identifiers plus the contiguous run of vote columns
g <- subset(g, select = c(RaceDate, State, Area, TotalVotes:DemVotesMajorPercent))

# The election year is the first four characters of RaceDate; RaceDate itself
# is dropped once the year has been extracted
race_year <- as.numeric(substr(g$RaceDate, 1, 4))
g$RaceDate <- NULL
g$year <- race_year

# --- FIPS Merge ---
# Build a (state, county name) -> numeric FIPS lookup from tidycensus::fips_codes,
# with county names normalised to the spelling used in the election workbook.
states_exclude <- c("Alaska", "Puerto Rico", "American Samoa", "Guam", "Northern Mariana Islands", "U.S. Minor Outlying Islands", "U.S. Virgin Islands", "District of Columbia")
fips <- subset(fips_codes, !(state_name %in% states_exclude))

# Drop the " County" / " Parish" designators and lower-case the names
fips$county <- gsub(" County", "", fips$county)
fips$county <- gsub(" Parish", "", fips$county)
fips$county <- tolower(fips$county)

# Fix misc. county names
# In Virginia and Missouri the " city" designator is removed, except for the
# names listed below, which keep it.
va_keep_city <- c(
  "bedford city", "charles city", "james city", "franklin city", "fairfax city",
  "richmond city", "roanoke city", "norfolk city"
)
mo_keep_city <- c("kansas city", "st. louis city")
fips$county <- with(fips, ifelse(
  (state_name == "Virginia" & !(county %in% va_keep_city)) |
    (state_name == "Missouri" & !(county %in% mo_keep_city)),
  gsub(" city", "", county),
  county
))

# Spelling differences between fips_codes and the election workbook.
# Element-wise assignment through which() is a harmless no-op when a name is
# absent, whereas `fips[cond, ]$county <- value` stops with
# "replacement has 1 row, data has 0" if no row matches.
county_renames <- data.frame(
  state = c("MS", "TX", "IL", "ND", "IN", "IL", "MD", "MD", "MD"),
  from = c(
    "desoto", "dewitt", "dupage", "lamoure", "laporte", "lasalle",
    "prince george's", "queen anne's", "st. mary's"
  ),
  to = c(
    "de soto", "de witt", "du page", "la moure", "la porte", "la salle",
    "prince georges", "queen annes", "st. marys"
  )
)
for (k in seq_len(nrow(county_renames))) {
  hit <- which(
    fips$state == county_renames$state[k] & fips$county == county_renames$from[k]
  )
  fips$county[hit] <- county_renames$to[k]
}

# Numeric FIPS = state code followed by county code; the component columns
# are dropped afterwards
fips$fips <- as.numeric(paste0(fips$state_code, fips$county_code))
fips$state_code <- NULL
fips$county_code <- NULL

# Prepare election data for merge: county names are matched in lower case
g$Area <- tolower(g$Area)

# Rows in the workbook that are not ordinary counties and must not be merged:
# - non-geographic vote categories (special ballots, overseas votes, ...)
# - Kansas City, MO (reported separately from 2004; would be double counted)
# - metadata rows describing vote adjustments
area_exclude <- c(
  "votes not reported by county",
  "special ballots",
  "special absentee",
  "federal absentees",
  "federal ballots",
  "overseas vote",
  "federal absentee",
  "absentee",
  "votes not included in the average elector vote",
  "kansas city"
)

# Keep only 2020 rows whose Area is not in the exclusion list
g <- filter(g, year == 2020, !(Area %in% area_exclude))

# Helpers used for cleaning and aggregation
na_string_to_missing <- function(x) ifelse(x == "N/A", NA, x)
sum_votes <- function(x) sum(as.numeric(x), na.rm = TRUE)

party_vote_cols <- c("RepVotes", "DemVotes", "ThirdVotes", "OtherVotes")
all_vote_cols <- c("TotalVotes", party_vote_cols)

# Turn literal "N/A" strings in the party vote columns into real NA values
g <- mutate(g, across(all_of(party_vote_cols), na_string_to_missing))

# Aggregate DC: every District of Columbia row gets the same Area label, so
# the (State, Area) sums below collapse DC into a single row
g <- mutate(g, Area = ifelse(State == "District of Columbia", "district of columbia", Area))
g <- group_by(g, State, Area)
g <- summarize(g, across(all_of(all_vote_cols), sum_votes))
g <- ungroup(g)

# Discard areas that report no votes at all
g <- filter(g, TotalVotes != 0)

# --- Map Alaska election districts to counties ---
# Read the 2020 Alaska voting-district (VTD) shapefile
ak_vtd_path <- here("data", "input", "ak_district_map", "tl_2020_02_vtd20", "tl_2020_02_vtd20.shp")
ak <- st_read(ak_vtd_path)

# The first two characters of the VTD code are used as the House District id
ak$district <- substr(ak$VTDST20, 1, 2)

# Repair geometries up front so the dissolve below does not fail
message("Cleaning Alaska geometries...")
ak <- ak %>%
  st_make_valid() %>%
  st_buffer(dist = 0)

# Dissolve VTDs into one feature per (district, state, county) combination
ak_districts <- summarise(
  group_by(ak, district, STATEFP20, COUNTYFP20),
  .groups = "drop"
)
ak_districts <- st_make_valid(ak_districts)

# Numeric county FIPS of each dissolved piece
ak_districts$fips <- as.numeric(with(ak_districts, paste0(STATEFP20, COUNTYFP20)))
# Population raster (WorldPop 2020, 1 km aggregated file)
pop <- rast(here("data", "input", "worldpop", "ppp_2020_1km_Aggregated.tif"))

# Bring the district polygons into the raster's CRS when they differ
pop_crs <- crs(pop)
if (crs(ak_districts) != pop_crs) {
  ak_districts <- st_transform(ak_districts, pop_crs)
}

# District ids become numeric so they can be joined to the vote table
ak_districts$district <- as.numeric(ak_districts$district)

# Alaska county boundaries (2020 cartographic boundary file), reduced to the
# numeric FIPS code and geometry
cty <- tigris::counties(cb = TRUE, year = 2020) %>%
  filter(STATEFP == "02")
cty$fips <- as.numeric(paste0(cty$STATEFP, cty$COUNTYFP))
cty <- subset(cty, select = c(fips))

# Counties must share the CRS of the district polygons
district_crs <- crs(ak_districts)
if (crs(cty) != district_crs) {
  cty <- st_transform(cty, district_crs)
}
# Alaska returns are reported by election district: pull the district number
# out of the Area label and keep only the vote columns
votes.ak <- g %>%
  filter(State == "Alaska") %>%
  mutate(district = as.numeric(gsub("election district ", "", Area))) %>%
  dplyr::select(district, TotalVotes, RepVotes, DemVotes, ThirdVotes, OtherVotes)

# Attach the district-level votes to every district polygon
ak_districts <- left_join(ak_districts, votes.ak, by = "district")

# Population inside each polygon, and that polygon's share of the summed
# population of all polygons
ak_districts$pop <- exact_extract(pop, ak_districts, fun = "sum")
total_polygon_pop <- sum(ak_districts$pop)
ak_districts$pop.share <- ak_districts$pop / total_polygon_pop

# Aggregate to counties
# Intersect county polygons with the Alaska district polygons, falling back to
# progressively more tolerant strategies if the intersection fails:
#   1. one st_intersection() call with the current sf settings
#   2. the same call with s2 spherical geometry switched off
#   3. pairwise intersections, skipping any county/district pair that errors
message("Validating geometries...")
# Use buffer(0) on entire objects for Windows compatibility
cty <- st_make_valid(st_buffer(cty, dist = 0))
ak_districts <- st_make_valid(st_buffer(ak_districts, dist = 0))
message("Geometry validation complete")

election_county_intersection <- suppressWarnings(
  try(st_intersection(cty, ak_districts), silent = TRUE)
)

if (inherits(election_county_intersection, "try-error")) {
  message("Standard intersection failed. Trying alternative approach...")

  # Switch s2 spherical geometry OFF for the retry. The previous setting is
  # remembered so it can be restored afterwards instead of forcing it to TRUE.
  s2_was_on <- sf_use_s2()
  sf_use_s2(FALSE)
  election_county_intersection <- suppressWarnings(
    try(st_intersection(cty, ak_districts), silent = TRUE)
  )

  if (inherits(election_county_intersection, "try-error")) {
    message("Still failing. Using individual intersections...")
    # Intersect every county/district pair on its own so that one bad pair
    # does not abort the whole operation. expand.grid() varies its first
    # column fastest, i.e. districts are the inner index, counties the outer.
    pairs <- expand.grid(
      j = seq_len(nrow(ak_districts)),
      i = seq_len(nrow(cty))
    )
    intersection_list <- lapply(seq_len(nrow(pairs)), function(k) {
      i <- pairs$i[k]
      j <- pairs$j[k]
      tryCatch(
        {
          piece <- suppressWarnings(st_intersection(cty[i, ], ak_districts[j, ]))
          if (nrow(piece) > 0) piece else NULL
        },
        error = function(e) {
          message("Skipping problematic intersection: county ", i, " with district ", j)
          NULL
        }
      )
    })
    # Drop empty and failed pairs
    intersection_list <- Filter(Negate(is.null), intersection_list)

    if (length(intersection_list) > 0) {
      election_county_intersection <- do.call(rbind, intersection_list)
    } else {
      sf_use_s2(s2_was_on)
      stop("All intersections failed. Check geometry validity.")
    }
  }

  # Restore s2 setting
  sf_use_s2(s2_was_on)
}

# Keep polygonal parts only, then compute the population inside each
# county-by-district piece
election_county_intersection <- st_collection_extract(election_county_intersection, "POLYGON")
election_county_intersection$pop.inter <- exact_extract(pop, election_county_intersection, "sum")

# Allocate each Alaska election district's votes to counties in proportion to
# population.
#
# Every intersection piece carries the votes of its whole district, so the
# weight of a piece must be its share of the *district's* population. The
# `pop` column is the population of a single district-by-county polygon, not of
# the whole district; dividing by it gives each county a weight close to 1 and
# counts a multi-county district's votes once per county. Weights are therefore
# normalised within each district, so they sum to 1 and each district's votes
# are distributed exactly once.
#
# Districts whose pieces have zero total population yield NaN weights, which
# are dropped by na.rm = TRUE.
ak.out <- election_county_intersection %>%
  st_drop_geometry() %>%
  group_by(district) %>%
  mutate(district_weight = pop.inter / sum(pop.inter, na.rm = TRUE)) %>%
  group_by(fips) %>%
  summarize(
    RepVotes = sum(RepVotes * district_weight, na.rm = TRUE),
    DemVotes = sum(DemVotes * district_weight, na.rm = TRUE),
    ThirdVotes = sum(ThirdVotes * district_weight, na.rm = TRUE),
    OtherVotes = sum(OtherVotes * district_weight, na.rm = TRUE),
    TotalVotes = sum(TotalVotes * district_weight, na.rm = TRUE)
  )

# The district-level Alaska rows are removed here; the county-level Alaska
# estimates in ak.out are appended at the end of this section
g <- filter(g, State != "Alaska")

# Distinct (State, Area) pairs that need a FIPS code
gm <- unique(subset(g, select = c(State, Area)))

# The FIPS table gets an explicit row for the District of Columbia
dc_row <- data.frame(county = "district of columbia", state_name = "District of Columbia", fips = 11001)
fips <- distinct(bind_rows(fips, dc_row))

# Pairs with a missing State or Area cannot be matched, so drop them
gm <- filter(gm, !is.na(State) & !is.na(Area))

# Report any (State, Area) pair that has no counterpart in the FIPS table
county_key <- c("Area" = "county", "State" = "state_name")
unmatched.counties <- anti_join(gm, fips, by = county_key)
if (nrow(unmatched.counties) > 0) {
  message("Found ", nrow(unmatched.counties), " unmatched counties:")
  print(unmatched.counties, n = Inf)
}

# Attach FIPS codes to the pairs, bring them back onto the vote rows, then
# append the Alaska county estimates
gm <- left_join(gm, fips, by = county_key)
pres_elec_fips <- g %>%
  left_join(gm, by = c("Area", "State")) %>%
  bind_rows(ak.out)

# Sanity check: report duplicated column names in the combined table.
# Nothing is renamed or removed here, so the message only lists the names
# rather than claiming that they are being cleaned.
dup_cols <- unique(names(pres_elec_fips)[duplicated(names(pres_elec_fips))])
if (length(dup_cols) > 0) {
  message("Found duplicate column names: ", paste(dup_cols, collapse = ", "))
}

# Calculate vote percentages
# demshare / repshare use all votes cast as the denominator; the *_major
# variants use the two-party (Democratic + Republican) total.
pres_elec_fips <- pres_elec_fips %>%
  mutate(
    demshare = DemVotes / TotalVotes * 100,
    repshare = RepVotes / TotalVotes * 100,
    demshare_major = DemVotes / (DemVotes + RepVotes) * 100,
    repshare_major = RepVotes / (DemVotes + RepVotes) * 100
  )

# Rows without a FIPS code (areas that did not match the FIPS table) are
# dropped, then the output columns are selected and lower-cased.
# rename_with() replaces the superseded rename_all().
pres_elec_fips <- filter(pres_elec_fips, !is.na(fips))
pres_elec_fips <- pres_elec_fips %>%
  dplyr::select(fips, TotalVotes, RepVotes, DemVotes, ThirdVotes, OtherVotes, demshare, repshare, demshare_major, repshare_major) %>%
  rename_with(tolower)

# Make sure the output directory exists before writing
out_path <- here("data", "inter", "pres_elections_2020.csv")
dir.create(dirname(out_path), recursive = TRUE, showWarnings = FALSE)
write_csv(pres_elec_fips, out_path)

# cat() interprets the newline escape; print() would show it literally as
# [1] "\nFull dataset saved ..."
cat("\nFull dataset saved to pres_elections_2020.csv\n")